# First of all, import packages for data cleaning and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# FIX: '%matplotlib inline' is an IPython magic, not Python — it is a
# SyntaxError when this file is run as a plain .py script. Keep it commented
# out here; uncomment only inside a Jupyter notebook cell.
# %matplotlib inline
import math
import os
from functools import reduce  # to merge multiple dataframes
from scipy import stats  # boxcox transform
import warnings
warnings.filterwarnings("ignore")  # to ignore warning messages
# set working directory ('xxx' is a placeholder -- point it at the data folder)
path = r'xxx'
if os.path.isdir(path):  # guard so the placeholder path does not crash the run
    os.chdir(path)
# Read dataset 'data.csv' and print its shape, dtypes, and summary statistics
dataset = pd.read_csv('data.csv')
print('1) The number of rows and columns of this dataset: ' + str(dataset.shape))
summary_items = [
    ('2) The type of data for each column of this dataset: ', dataset.dtypes),
    ('3) The description of this dataset: ', dataset.describe()),
]
for label, value in summary_items:
    print(label, value, sep='\n')
1) For each neighborhood, find the median sale price for homes that were (a) sold in 2006 and (b) have indoor square footage >= 2000 sq ft (excluding porches, garages, decks, and veneers)
# Keep only 2006 sales up front so the later column math touches fewer rows
df2006 = dataset[dataset['YrSold'] == 2006].copy()
# Derive two helper columns:
#   PorchSF  -- total porch square feet
#   IndoorSF -- indoor square feet, assumed to be
#       GrLivArea (above-grade living area) + TotalBsmtSF (basement)
#       - porches (OpenPorchSF + EnclosedPorch + 3SsnPorch + ScreenPorch)
#       - garages (GarageArea)
#       - decks (WoodDeckSF)
#       - veneers (MasVnrArea)
df2006['PorchSF'] = (df2006['OpenPorchSF'] + df2006['EnclosedPorch']
                     + df2006['3SsnPorch'] + df2006['ScreenPorch'])
df2006['IndoorSF'] = (df2006['GrLivArea'] + df2006['TotalBsmtSF']
                      - df2006['PorchSF']
                      - df2006['GarageArea']
                      - df2006['WoodDeckSF']
                      - df2006['MasVnrArea'])
# Restrict to homes with at least 2000 sf indoors and keep only the three
# columns needed for the median-by-neighborhood answer
keep_cols = ['Neighborhood', 'SalePrice', 'IndoorSF']
sub_df = df2006[df2006['IndoorSF'] >= 2000][keep_cols]
print('The number of rows and columns of this sub_dataset', sub_df.shape)
print('The number of unique values in the Neighborhood column of this sub_dataset', len(sub_df.Neighborhood.unique()))
# Median sale price per neighborhood, sorted ascending
Median_SalePrice_byNeighbor = (
    sub_df.groupby(['Neighborhood'], as_index=False)
          .agg({'SalePrice': 'median'})
          .rename(columns={'SalePrice': 'Median_SalePrice'})
          .sort_values('Median_SalePrice')
)
print(Median_SalePrice_byNeighbor)
# Median_SalePrice_byNeighbor.to_excel('MedianSalePrice.xlsx')
2) The seasonality of the local housing market
# Copy the data for question 2 and build a month-year 'Date' key from
# 'MoSold' + 'YrSold', with rows ordered chronologically per neighborhood
q2_df = (dataset.copy()
         .groupby(['Neighborhood'])
         .apply(lambda g: g.sort_values(['YrSold', 'MoSold']))
         .reset_index(drop=True))
month_year = q2_df['MoSold'].map(str) + '-' + q2_df['YrSold'].map(str)
q2_df['Date'] = pd.to_datetime(month_year, format='%m-%Y').dt.strftime('%m-%Y')
print(q2_df[['YrSold', 'MoSold', 'Date', 'Neighborhood', 'SalePrice']].head(10))
2.1) Time Series Analysis
# Build the monthly sale-price series: min/max/mean/median per Date
price_stats = {"SalePrice": [min, max, 'mean', 'median']}
grouped_ts = q2_df.groupby(['Date', 'YrSold', 'MoSold']).agg(price_stats)
grouped_ts.columns = ["_".join(pair) for pair in grouped_ts.columns.ravel()]
df_ts = grouped_ts.reset_index().sort_values(['YrSold', 'MoSold'])
# Sanity checks before plotting
print('1) Check Missing Values: ', df_ts.isnull().any(), sep='\n')
print('2) Demo of Sale Price Series: ', df_ts.head(), sep='\n')
# Plot the monthly median with one tick per month
plt.figure()
df_ts.plot(x='Date', y='SalePrice_median', style='o-', figsize=(15, 8))
plt.xlabel('Date Sold')
plt.ylabel('Median Sale Price')
plt.xticks(np.arange(len(df_ts['Date'])), df_ts.Date, rotation=90)
plt.tight_layout()
plt.savefig('df_ts.png')
plt.show()
Use the Interquartile Range (IQR) to check for outliers in the median sale price series
# Quartile helpers used as named aggregation functions below
def q1(x):
    """Return the first quartile (25th percentile) of *x*."""
    return x.quantile(q=0.25)


def q3(x):
    """Return the third quartile (75th percentile) of *x*."""
    return x.quantile(q=0.75)
# Combine two fence checks into a single outlier flag. Returns the STRINGS
# 'True'/'False' (not booleans) because the caller filters on == 'True'.
def outlierCheck(row, name1, name2):
    """Return 'True' when either named field of *row* is positive, else 'False'."""
    beyond_a_fence = row[name1] > 0 or row[name2] > 0
    return 'True' if beyond_a_fence else 'False'
# Score each monthly median against the classic 1.5*IQR fences
df11_ts = df_ts[['Date', 'SalePrice_median']].copy()
stat_df = df11_ts.agg({'SalePrice_median': [q1, q3]})
quartile1 = stat_df.loc['q1'].values[0]
quartile3 = stat_df.loc['q3'].values[0]
IQR = quartile3 - quartile1
parameter = 1.5  # conventional IQR fence multiplier
lower_bound = quartile1 - IQR * parameter
upper_bound = quartile3 + IQR * parameter
print(lower_bound, upper_bound, sep='\n')
# A positive diff means the point lies beyond the corresponding fence
df11_ts['LowerDiff'] = lower_bound - df11_ts['SalePrice_median']
df11_ts['UpperDiff'] = df11_ts['SalePrice_median'] - upper_bound
df11_ts['outlier'] = df11_ts.apply(
    lambda row: outlierCheck(row, 'LowerDiff', 'UpperDiff'), axis=1)
df11_ts[df11_ts['outlier'] == 'True']
# # Replace outlier (value for date 2007-12) by observation last year in the same month
# df_ts_imputed = df_ts.copy()
# # print(df_ts_imputed[df_ts_imputed['Date']=='12-2007'])
# df_ts_imputed['Date'] = pd.to_datetime(df_ts_imputed['Date'])
# df_ts_imputed.set_index('Date', inplace=True)
# print(df_ts_imputed.loc['2007-12-01','SalePrice_median'])
# df_ts_imputed.loc['2007-12-01','SalePrice_median'] = (df_ts_imputed.loc['2006-12-01','SalePrice_median'])
# print(df_ts_imputed.loc['2007-12-01','SalePrice_median'])
# Build a datetime-indexed copy of the monthly series (no imputation applied;
# the imputation experiment above is left disabled).
# NOTE(review): this datetime index is immediately rebuilt with a plain string
# index in the k-means cell further down, so it appears unused -- confirm
# before relying on it downstream.
df_ts_imputed = df_ts.copy()
df_ts_imputed['Date'] = pd.to_datetime(df_ts_imputed['Date'])
df_ts_imputed.set_index('Date', inplace=True)
There are many other methods that could be used to detect anomaly:
(1) Clustering-Based Anomaly Detection : k-means algorithm One way to think about the k-means model is that it places a circle (or, in higher dimensions, a hyper-sphere) at the center of each cluster, with a radius defined by the most distant point in the cluster. Alternatively, we could use Gaussian Mixture Model (GMM), which allows each cluster to be modeled as an ellipse with arbitrary orientation. Fundamentally it is an algorithm for density estimation. That is to say, the result of a GMM fit to some data is technically not a clustering model, but a generative probabilistic model describing the distribution of the data.
(2) k-means has two main disadvantages: a lack of flexibility in cluster shape and a lack of probabilistic cluster assignment
from sklearn.cluster import KMeans
# Re-index the monthly medians by Date for clustering-based outlier checks
df_ts_imputed = df_ts.copy().set_index('Date')
df_check_outlier = df_ts_imputed[['SalePrice_median']]
# Elbow curve: fit k-means for k = 1..19 and plot the score for each k
n_cluster = range(1, 20)
kmeans = [KMeans(n_clusters=i).fit(df_check_outlier) for i in n_cluster]
scores = [kmeans[i].score(df_check_outlier) for i in range(len(kmeans))]
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(n_cluster, scores)
ax.grid(True)
plt.xlabel('Number of Clusters')
plt.ylabel('Score')
plt.title('Elbow Curve')
plt.show()
# Final clustering with k = 6 (chosen from the elbow curve)
km = KMeans(n_clusters=6)
km.fit(df_check_outlier)
km.predict(df_check_outlier)
y_kmeans = km.predict(df_check_outlier)
labels = km.labels_
# print('Clustering Labels: ' , labels, sep='\n')
centroids = km.cluster_centers_
# print('centroids: ' , centroids, sep='\n')
plt.figure(figsize=(16, 12))
# FIX: np.float was deprecated in NumPy 1.20 and later removed; the builtin
# float is the documented replacement and produces identical colors here.
plt.scatter(df_check_outlier.index, df_check_outlier['SalePrice_median'],
            c=labels.astype(float), marker='X', s=200)
plt.xticks(rotation=90)
plt.grid(True)
plt.savefig('Kmeans_Outliers.png')
plt.show()
# Attach each observation's cluster label and that cluster's centroid value,
# aligned on the Date index
label_ls = []
center_ls = []
for pos in range(len(df_check_outlier)):
    cluster_id = km.labels_[pos]
    label_ls.append(cluster_id)
    center_ls.append(km.cluster_centers_[cluster_id][0])
cluster_df = pd.DataFrame(
    {'Cluster_Label': label_ls, 'Cluster_Centroid': center_ls},
    index=df_check_outlier.index)
full_df = pd.concat([df_check_outlier, cluster_df], axis=1)
# import seaborn as sns
# sns.set_style("whitegrid")
# plot_cluster = sns.lmplot(data = full_df.reset_index(), x='Date', y='SalePrice_median', hue='Cluster_Label',
# fit_reg=False, legend=True, legend_out=True,aspect=30/10)
# plot_cluster.set_xticklabels(rotation=90)
# def DistanceByCenter(data,model):
# distance = pd.Series()
# for i in range(len(df_check_outlier)):
# date = df_check_outlier.index[i]
# x_data = np.array(df_check_outlier.loc[date])
# x_cValue = km.cluster_centers_[km.labels_[i]]
# distance.set_value(i,abs(float(float(x_data[0])- float(x_cValue[0]))))
# return distance
# outliers_fraction = 0.01
# # # get the distance between each point and its nearest centroid. The biggest distances are considered as anomaly
# distance = DistanceByCenter(df_check_outlier, km)
# print(distance)
# number_of_outliers = int(math.ceil(outliers_fraction*len(distance)))
# threshold = distance.nlargest(number_of_outliers).min()
# print(threshold)
# # anomaly1 contain the anomaly result of the above method Cluster (0:normal, 1:anomaly)
# df_check_outlier['anomaly1'] = (distance >= threshold).astype(int)
# print(df_check_outlier)
(1) Isolation Forests For Anomaly Detection It detects anomalies purely based on the concept of isolation without employing any distance or density measure —fundamentally different from all existing methods
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')
# Standardize the median prices before fitting the forest
data = df_check_outlier[['SalePrice_median']]
scaler = StandardScaler()
np_scaled = scaler.fit_transform(data)
data = pd.DataFrame(np_scaled)
# train isolation forest: ~1% of points are treated as anomalies
outliers_fraction = 0.01
model = IsolationForest(contamination=outliers_fraction)
model.fit(data)
df_check_outlier['anomaly_ISOF'] = model.predict(data)
# visualization: dashed blue line = series, red X = points predicted -1
fig, ax = plt.subplots(figsize=(20, 6))
a = df_check_outlier.loc[df_check_outlier['anomaly_ISOF'] == -1, 'SalePrice_median']  # anomaly
print(type(a))
ax.plot(df_check_outlier.index, df_check_outlier['SalePrice_median'],
        color='blue', label='Normal', linestyle='--')
ax.scatter(a.index, a, color='red', label='Anomaly', s=200)
plt.xticks(rotation=90)
plt.grid(True)
plt.legend()
plt.savefig('IsolationForest_Outliers.png')
plt.show()
(3) OneClassSVM The idea of SVM for anomaly detection is to find a function that is positive for regions with high density of points, and negative for small densities.
from sklearn.svm import OneClassSVM
# Standardize, then fit a one-class SVM with an RBF kernel
data = df_check_outlier[['SalePrice_median']]
scaler = StandardScaler()
np_scaled = scaler.fit_transform(data)
data = pd.DataFrame(np_scaled)
# train oneclassSVM
model = OneClassSVM(nu=0.04, kernel="rbf", gamma=0.01)
model.fit(data)
df_check_outlier['anomaly_SVM'] = model.predict(data)
# Dashed line = series; red dots = points the SVM labels -1 (anomalous)
fig, ax = plt.subplots(figsize=(15, 6))
svm_anomalies = df_check_outlier.loc[df_check_outlier['anomaly_SVM'] == -1, 'SalePrice_median']
ax.plot(df_check_outlier.index, df_check_outlier['SalePrice_median'],
        color='blue', linestyle='--')
ax.scatter(svm_anomalies.index, svm_anomalies, color='red')
plt.grid(True)
plt.xticks(rotation=90)
plt.show()
(3) EllipticEnvelope Gaussian distribution is also called normal distribution. We will be using the Gaussian distribution to develop an anomaly detection algorithm; that is, we'll assume that our data are normally distributed. This is an assumption that cannot hold true for all data sets, yet when it does, it proves an effective method for spotting outliers. Scikit-Learn's covariance.EllipticEnvelope is a function that tries to figure out the key parameters of our data's general distribution by assuming that our entire data is an expression of an underlying multivariate Gaussian distribution
ABOVE are methods for outlier detection
# NOTE(review): the lines below are an orphaned fragment -- apparently the tail
# of a deleted forecast-accuracy cell ('forecast', 'actual', 'fc', 'test' and
# 'acf' are never defined here) -- and they do not parse as standalone Python.
# They are commented out so the file is importable; restore the full accuracy
# cell if these metrics are needed.
#             actual[:,None]]), axis=1)
# maxs = np.amax(np.hstack([forecast[:,None],
#                           actual[:,None]]), axis=1)
# minmax = 1 - np.mean(mins/maxs) # minmax
# acf1 = acf(fc-test)[1] # ACF1
# In order to stabilize the variance,
# Variance-stabilizing transforms of the monthly median price
# Log transform
df_ts_imputed['Log_SalePrice_median'] = np.log(df_ts_imputed['SalePrice_median'])
# Box-Cox transform; scipy also returns the fitted lambda parameter
bc_values, fitted_lambda = stats.boxcox(df_ts_imputed['SalePrice_median'])
df_ts_imputed['BC_SalePrice_median'] = bc_values
print('BoxCox Transformation Parameter: ', fitted_lambda)
# NOTE(review): this cell duplicates the 'Blmngtn' cell near the end of the
# file, but here it runs BEFORE 'df1' is created (df1 is built in the
# regional-markets section below), so executing the file top-to-bottom raises
# a NameError. Commented out; the identical later cell does the work.
# df_Blmngtn = df1[df1['Neighborhood']=='Blmngtn'][['Date','SalePrice_median']]
# df_Blmngtn.reset_index(drop=True, inplace=True)
# print(df_Blmngtn)
# plt.figure()
# df_Blmngtn.plot(x='Date',y='SalePrice_median', style='o-',title='Bloomington Height')
# plt.xlabel('Date Sold')
# Plot the raw, log-transformed, and Box-Cox-transformed series stacked vertically
panels = [
    ('SalePrice_median', 'Median Sale Price'),
    ('Log_SalePrice_median', 'Median Sale Price - Log Transform'),
    ('BC_SalePrice_median', 'Median Sale Price - Boxcox Transform'),
]
fig, ax = plt.subplots(3, figsize=(15, 15))
for axis, (column, title) in zip(ax, panels):
    df_ts_imputed.plot(y=column, use_index=True, style='o-', ax=axis, legend=False)
    axis.set_title(title)
    axis.set_xlabel('Date Sold')
plt.show()
# fig.savefig('ts_data.png')
# Seasonal decomposition
from statsmodels.tsa.seasonal import seasonal_decompose
# Fit both a multiplicative and an additive decomposition of the series
result_mul = seasonal_decompose(df_ts_imputed['SalePrice_median'],
                                model='multiplicative', extrapolate_trend='freq')
result_add = seasonal_decompose(df_ts_imputed['SalePrice_median'],
                                model='additive', extrapolate_trend='freq')
# Plot each decomposition with its own title (savefig calls left disabled)
for result, title in ((result_mul, 'Multiplicative Decompose'),
                      (result_add, 'Additive Decomposition')):
    plt.figure()
    result.plot()
    plt.suptitle(title, ha='center', va='top', y=1.0, fontsize=10)
    plt.xlabel('Date Sold')
    plt.tight_layout()
    # plt.savefig(title.replace(' ', '_') + '.png')  # save before show if needed
    plt.show()
The plot above clearly shows that the sale price series has obvious seasonality.
Next let's start this forecasting process by statistical test for stationarity
Stationarity is a property of a time series. A stationary series is one where the values of the series are not a function of time. That is, statistical properties of the series like mean, variance and autocorrelation are constant over time. Autocorrelation of a series is the correlation of the series with its previous values.
1) There are multiple implementations of Unit Root tests like:
(1.1) Augmented Dickey Fuller test (ADF Test) (It has an alternate hypothesis of linear or difference stationary) (1.2) Kwiatkowski-Phillips-Schmidt-Shin – KPSS test (trend stationary) (1.3) Philips Perron test (PP Test)
The most commonly used is the ADF test, where the null hypothesis is the time series possesses a unit root and is non-stationary. So, if the P-Value in ADF test is less than the significance level (0.05), you reject the null hypothesis. The KPSS test, on the other hand, is used to test for trend stationarity. The null hypothesis and the P-Value interpretation is just the opposite of ADF test.
If the series is not stationary, we could (a) Differencing the Series (once or more) (b) Take the log of the series (c) Take the nth root of the series (d) Combination of the above
from statsmodels.tsa.stattools import adfuller, kpss
# ADF Test -- H0: the series has a unit root (is NOT stationary)
result = adfuller(df_ts_imputed.SalePrice_median, autolag='AIC')
print(f'ADF Statistic: {result[0]}')  # The more negative this statistic, the more likely we are to reject H0, vice versa.
print(f'p-value: {result[1]}')
# FIX: the header used to be re-printed (with the typo 'Critial') on every
# loop iteration; print it once, then one line per significance level.
print('Critical Values:')
for key, value in result[4].items():
    print(f'{key}, {value}')  # If the test statistic is less than the critical value, we can reject H0
# KPSS Test -- H0: the process is stationary
# NOTE(review): regression='c' tests stationarity around a constant level;
# use regression='ct' if trend stationarity is really what is meant here.
result = kpss(df_ts_imputed.SalePrice_median, regression='c')
print('\nKPSS Statistic: %f' % result[0])  # If the test statistic is greater than the critical value, we reject H0
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[3].items():
    print(f'{key}, {value}')
# From the test results, we can see, this series is stationary
Case 1: Both tests conclude that the series is not stationary -> series is not stationary Case 2: Both tests conclude that the series is stationary -> series is stationary Case 3: KPSS = stationary and ADF = not stationary -> trend stationary, remove the trend to make series strict stationary Case 4: KPSS = not stationary and ADF = stationary -> difference stationary, use differencing to make series stationary
(1) Partial autocorrelation can be imagined as the correlation between the series and its lag, after excluding the contributions from the intermediate lags. So, PACF sort of conveys the pure correlation between a lag and the series. That way, you will know if that lag is needed in the AR term or not
# Give the series an explicit frequency so statsmodels does not have to guess
X = df_ts_imputed[['SalePrice_median']]
X.index = pd.DatetimeIndex(X.index.values, freq=X.index.inferred_freq)
# Plot ACF and PACF, with the raw series in a third panel for reference
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
fig, ax = plt.subplots(3, figsize=(12, 15))
ax[0] = plot_acf(X, ax=ax[0], lags=50)
ax[1] = plot_pacf(X, ax=ax[1], lags=50)
ax[2].plot(X, linestyle='--')
Time Series Methods: Moving Average, Simple Exponential Smoothing, Holt’s Linear Trend method, Holt-Winters Method
# Moving Average with a 3-month window
n = 3
X['MA{}'.format(str(n))] = X['SalePrice_median'].rolling(n).mean()
# Chart the actual series against the rolling mean (first n points have no MA)
plt.figure(figsize=(15, 7))
plt.title("Moving average\n window size = {}".format(n))
plt.plot(X.SalePrice_median[n:], "g", label="Actual")
plt.plot(X.MA3[n:], "red", label="Rolling Mean Trend")
plt.grid(True)
plt.legend(loc='best')
plt.show()
# Simple Exponential Smoothing: few data points, Irregular data, No seasonality or trend
def exponential_smoothing(data, alpha):
    """Exponentially smooth *data*, returning a one-column 'SES' DataFrame.

    The first smoothed value equals the first observation; each subsequent
    value is alpha * observation + (1 - alpha) * previous smoothed value.
    """
    smoothed = [data[0]]  # seed with the first observation
    for pos in range(1, len(data)):
        smoothed.append(alpha * data[pos] + (1 - alpha) * smoothed[-1])
    return pd.DataFrame(smoothed, index=data.index, columns=['SES'])
# Overlay SES curves for several smoothing factors against the actual series
alphas = [0.05, 0.3, 0.6]
plt.figure(figsize=(15, 7))
for alpha in alphas:
    smoothed = exponential_smoothing(X.SalePrice_median, alpha)
    plt.plot(smoothed, label='Alpha {}'.format(alpha))
plt.plot(X.SalePrice_median, 'black', linestyle='--', label="Actual")
plt.legend(loc='best')
plt.axis('tight')
plt.title('Simple Exponential Smoothing')
plt.grid(True)
plt.show()
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt
# Holt's Linear Smoothing: Trend in data, No seasonality.
# NOTE(review): 'smoothing_slope' was renamed 'smoothing_trend' in newer
# statsmodels releases -- confirm the installed version still accepts it.
fit1 = Holt(np.asarray(X['SalePrice_median'])).fit(smoothing_level = 0.3,smoothing_slope = 0.1)
y_hat_avg1 = pd.DataFrame(fit1.fittedvalues,index=X.index, columns = ['holt_linear'])
# Holt-Winters: additive trend plus additive 12-month seasonality
fit2 = ExponentialSmoothing(np.asarray(X['SalePrice_median']) ,seasonal_periods=12 ,trend='add', seasonal='add',).fit()
y_hat_avg2 = pd.DataFrame(fit2.fittedvalues, index= X.index, columns = ['holt_winter'])
# Plot actual vs Holt-Winters fitted values (Holt-linear overlay left disabled)
plt.figure(figsize=(16,8))
plt.plot(X['SalePrice_median'],'black',linestyle = ':', marker="o",label='Actual')
# plt.plot(y_hat_avg1['holt_linear'], label='Holt linear')
plt.plot(y_hat_avg2['holt_winter'],'red' ,label='Holt Winter',linestyle = '--')
plt.legend(loc='best')
plt.savefig('holt_winter.png')
plt.show()
# 9-step-ahead forecasts (return values are not captured or printed here)
fit1.forecast(9)
fit2.forecast(9)
# Assemble observed value, smoothing-state components, and fitted value per date.
# NOTE(review): 'fit2.slope' is exposed as 'fit2.trend' in newer statsmodels --
# verify against the installed version.
df = pd.DataFrame(np.c_[X['SalePrice_median'], fit2.level, fit2.slope, fit2.season, fit2.fittedvalues],
columns=['Obs','level','trend','seasonal','yhat'],index=X.index)
print(df.tail())
# Set up parameter combinations for grid search
# import itertools
# p = d = q = range(0, 2)
# pdq = list(itertools.product(p, d, q))
# seasonal_pdq = [(x[0], x[1], x[2], 12) for x in list(itertools.product(p, d, q))]
# print(pdq,seasonal_pdq,sep='\n')
from statsmodels.tsa.statespace.sarimax import SARIMAX
# SARIMAX is an extension of the SARIMA model that also includes the modeling of exogenous variables
from sklearn.metrics import mean_squared_error
import pmdarima as pm
# Hold out the last 15% of the months as the testing set
split_at = int(len(X) * 0.85)
train = X.iloc[:split_at, :]
test = X.iloc[split_at:, :]
# Use grid search by setting different (p,d,q)X(P,D,Q,m) combinations to find a better SARIMA model
# def evaluate_sarima_model(X, arima_order, sarima_order):
# # prepare training dataset
# train_size = int(len(X) * 0.8)
# train, test = X[0:train_size], X[train_size:]
# # make predictions
# predictions = list()
# for t in range(len(test)):
# model = SARIMAX(train, order=arima_order, seasonal_order=sarima_order, enforce_invertibility=False, enforce_stationarity=False)
# model_fit = model.fit(disp=0)
# yhat = model_fit.forecast()[0]
# predictions.append(yhat)
# # calculate out of sample error
# error = mean_squared_error(test, predictions)
# return error
# # evaluate combinations of (p, d, q)X(p,d,q,m) values for a SARIMA model
# m=12
# def evaluate_models(dataset, p_values, d_values, q_values):
# best_score, best_cfg = float("inf"), None
# for p in p_values:
# for d in d_values:
# for q in q_values:
# order = (p,d,q), order_s =(p,d,q,m)
# try:
# mse = evaluate_sarima_model(dataset, order,order_s)
# if mse < best_score:
# best_score, best_param1, best_param2 = mse, order,order_s
# print('SARIMAX: %s x %s, MSE=%.3f' % (best_param1, best_param2,best_score))
# except:
# continue
# print('Best SARIMA: %s x %s, MSE=%.3f' % (best_param1, best_param2,best_score)))
# Seasonal auto-arima to select a parameter combination with smallest AIC
# Stepwise search over (p,d,q)x(P,D,Q,12): 'd' is chosen via the ADF test,
# seasonal differencing is fixed at D=1, and p/q are capped at 3.
auto_arima_model = pm.auto_arima(X, start_p=0, start_q=0,
test='adf', # use adftest to find optimal 'd'
max_p=3, max_q=3, # maximum p and q
m=12, # frequency of series
d=None, # let model determine 'd'
seasonal=True, # Seasonality
start_P=0,
D=1,
trace=True,
error_action='ignore',
suppress_warnings=True,
stepwise=True)
print(auto_arima_model.summary())
# Fit all the data and get fittedValues for all observations
# auto_arima_model_fit = auto_arima_model.fit(X)
# predicted = auto_arima_model_fit.predict_in_sample(X)
# future_forecast = pd.DataFrame(predicted,index = X.index,columns=['prediction'])
# df_forecast = pd.concat([X,future_forecast],axis=1)
#-----------------------------------------------------------------------------
# Fit the model using the training set and predict for testing set
auto_arima_model_fit = auto_arima_model.fit(train)
# future prediction for testing set
predicted_test = auto_arima_model_fit.predict(len(test))
predicted_test_df = pd.DataFrame(predicted_test, index=test.index, columns=['prediction'])
test_full_df = pd.concat([test, predicted_test_df], axis=1)
print(test_full_df.head())
# In sample prediction for training set
# NOTE(review): pmdarima's predict_in_sample() takes exogenous X as its first
# argument; passing 'train' (the endogenous series) to a model fitted without
# exogenous regressors looks wrong -- confirm against the pmdarima docs.
predicted_train = auto_arima_model_fit.predict_in_sample(train)
predicted_train_df = pd.DataFrame(predicted_train, index=train.index, columns=['prediction'])
train_full_df = pd.concat([train, predicted_train_df], axis=1)
print(train_full_df.head())
# FIX: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported way to stack the train and test frames.
result_df = pd.concat([train_full_df, test_full_df])
print(result_df.head())
# Auto-ARIMA using full dataset (left disabled; the train/test plot below is used)
# plt.figure(figsize=(16,8))
# plt.plot(df_forecast['SalePrice_median'][1:],'black',linestyle = ':', marker="o",label='Actual')
# plt.plot(df_forecast['prediction'][1:],'red' ,label='Auto-ARIMA',linestyle = '--')
# plt.legend(loc='best')
# plt.savefig('auto_arima.png')
# plt.show()
# Auto-ARIMA by training and testing set: actual (dotted) vs predicted (dashed)
plt.figure(figsize=(16, 8))
plt.plot(result_df['SalePrice_median'][1:], 'black', linestyle=':', marker="o", label='Actual')
plt.plot(result_df['prediction'][1:], 'red', label='Auto-ARIMA', linestyle='--')
plt.legend(loc='best')
plt.savefig('auto_arima.png')
plt.show()
# # we can set up couple of parameter combinations for SARIMAX in 'statsmodels' package
# # and select a model with a smaller AIC or other metrics
# sarima_model = SARIMAX(train, order=(1, 2, 1), seasonal_order=(1, 1, 0, 12),
# enforce_invertibility=False, enforce_stationarity=False,trend='c')
# sarima_model_fit = sarima_model.fit()
# yhat_SARIMA = sarima_model_fit.forecast(11, alpha=0.05)
# # or equally use predict with time range
# # yhat_SARIMA = model_fit.predict(start="2009-09-01", end="2010-07-01", dynamic=True)
# sarima_model_fit.summary()
# # SARIMA model
# # Plot training set, testing set and predicted yhat for testing set
# plt.figure(figsize=(16,8))
# plt.plot(train, label='Train')
# plt.plot(sarima_model_fit.fittedvalues,label='FittedValue')
# plt.plot(test, label='Test')
# plt.plot(yhat_SARIMA, label='SARIMA')
# plt.legend(loc='best')
# plt.show()
# result_df['residuals'] = result_df['SalePrice_median'] - result_df['prediction']
# fig, ax = plt.subplots(3, figsize=(12,15))
# ax[0] = plot_acf(result_df.residuals, ax=ax[0])
# ax[1] = plot_pacf(result_df.residuals, ax=ax[1])
# ax[2].plot(result_df.residuals)
# # sarima model residuals diagnostics
# sarima_model_fit.plot_diagnostics(figsize=(17,10))
# plt.show()
2.2) Look at the Regional Markets
# Group by neighborhood and month, then flatten the MultiIndex column names
grouped = q2_df.groupby(['Neighborhood', 'Date', 'YrSold', 'MoSold']).agg(
    {"SalePrice": [min, max, 'mean', 'median']})
grouped.columns = ["_".join(pair) for pair in grouped.columns.ravel()]
df = grouped.reset_index().sort_values(['Neighborhood', 'YrSold', 'MoSold'])
df1 = df[['Neighborhood', 'Date', 'SalePrice_min', 'SalePrice_max', 'SalePrice_mean', 'SalePrice_median']]
# The panel plots below use two columns, so round the row count up
n_rows = math.ceil(df1.groupby('Neighborhood').ngroups / 2)
# print(n_rows)
# Plot minimum sale price for each neighborhood, two panels per row
warnings.filterwarnings("ignore")
fig, axes = plt.subplots(figsize=(50, 50),
                         nrows=n_rows, ncols=2,
                         gridspec_kw=dict(hspace=0.4))
fig.subplots_adjust(bottom=0.08, top=0.92, left=0.1, wspace=0.2, hspace=0.4)
# fig.suptitle('Minimum Sale Price by Region')
axes_list = [item for sublist in axes for item in sublist]
for region_name, selection in df1.groupby('Neighborhood'):
    ax = axes_list.pop(0)
    selection.plot(x='Date', y='SalePrice_min', style='o-', label=region_name, ax=ax, legend=False)
    ax.set_title(region_name + '-' + 'Min Sale Price', fontsize=25, fontweight='bold')
    # FIX: tick_params expects booleans; the strings 'on'/'off' were
    # deprecated and then removed from matplotlib, so use False to hide ticks.
    ax.tick_params(
        which='both',
        bottom=False,
        left=False,
        right=False,
        top=False
    )
    ax.grid(linewidth=0.5)
    ax.set_ylabel('')
    ax.set_xlabel('')
    ax.spines['left'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.tick_params(axis='both', which='major', labelsize=20)
# Remove any leftover axes so empty panels do not appear
for ax in axes_list:
    ax.remove()
plt.tight_layout()
plt.subplots_adjust(hspace=1)
# plt.savefig('SalePrice_min.png')
# Plot maximum sale price for each neighborhood, two panels per row
warnings.filterwarnings("ignore")
fig, axes = plt.subplots(figsize=(50, 50),
                         nrows=n_rows, ncols=2,
                         gridspec_kw=dict(hspace=0.4))
fig.subplots_adjust(bottom=0.08, top=0.92, left=0.1, wspace=0.2, hspace=0.4)
# fig.suptitle('Minimum Sale Price by Region')
axes_list = [item for sublist in axes for item in sublist]
for region_name, selection in df1.groupby('Neighborhood'):
    ax = axes_list.pop(0)
    selection.plot(x='Date', y='SalePrice_max', style='o-', label=region_name, ax=ax, legend=False)
    ax.set_title(region_name + '-' + 'Max Sale Price', fontsize=25, fontweight='bold')
    # FIX: tick_params expects booleans; the strings 'on'/'off' were
    # deprecated and then removed from matplotlib, so use False to hide ticks.
    ax.tick_params(
        which='both',
        bottom=False,
        left=False,
        right=False,
        top=False
    )
    ax.grid(linewidth=0.5)
    ax.set_ylabel('')
    ax.set_xlabel('')
    ax.spines['left'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.tick_params(axis='both', which='major', labelsize=20)
# Remove any leftover axes so empty panels do not appear
for ax in axes_list:
    ax.remove()
plt.tight_layout()
plt.subplots_adjust(hspace=1)
# plt.savefig('SalePrice_max.png')
# Plot average sale price for each neighborhood, two panels per row
warnings.filterwarnings("ignore")
fig, axes = plt.subplots(figsize=(50, 50),
                         nrows=n_rows, ncols=2,
                         gridspec_kw=dict(hspace=0.4))
fig.subplots_adjust(bottom=0.08, top=0.92, left=0.1, wspace=0.2, hspace=0.4)
# fig.suptitle('Minimum Sale Price by Region')
axes_list = [item for sublist in axes for item in sublist]
for region_name, selection in df1.groupby('Neighborhood'):
    ax = axes_list.pop(0)
    selection.plot(x='Date', y='SalePrice_mean', style='o-', label=region_name, ax=ax, legend=False)
    ax.set_title(region_name + '-' + 'Average Sale Price', fontsize=25, fontweight='bold')
    # FIX: tick_params expects booleans; the strings 'on'/'off' were
    # deprecated and then removed from matplotlib, so use False to hide ticks.
    ax.tick_params(
        which='both',
        bottom=False,
        left=False,
        right=False,
        top=False
    )
    ax.grid(linewidth=0.5)
    ax.set_ylabel('')
    ax.set_xlabel('')
    ax.spines['left'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.tick_params(axis='both', which='major', labelsize=20)
# Remove any leftover axes so empty panels do not appear
for ax in axes_list:
    ax.remove()
plt.tight_layout()
plt.subplots_adjust(hspace=1)
# plt.savefig('SalePrice_mean.png')
# Plot median sale price for each neighborhood, two panels per row
warnings.filterwarnings("ignore")
fig, axes = plt.subplots(figsize=(50, 50),
                         nrows=n_rows, ncols=2,
                         gridspec_kw=dict(hspace=0.4))
fig.subplots_adjust(bottom=0.08, top=0.92, left=0.1, wspace=0.2, hspace=0.4)
# fig.suptitle('Minimum Sale Price by Region')
axes_list = [item for sublist in axes for item in sublist]
for region_name, selection in df1.groupby('Neighborhood'):
    ax = axes_list.pop(0)
    selection.plot(x='Date', y='SalePrice_median', style='o-', label=region_name, ax=ax, legend=False)
    ax.set_title(region_name + '-' + 'Median Sale Price', fontsize=25, fontweight='bold')
    # FIX: tick_params expects booleans; the strings 'on'/'off' were
    # deprecated and then removed from matplotlib, so use False to hide ticks.
    ax.tick_params(
        which='both',
        bottom=False,
        left=False,
        right=False,
        top=False
    )
    ax.grid(linewidth=0.5)
    ax.set_ylabel('')
    ax.set_xlabel('')
    ax.spines['left'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.tick_params(axis='both', which='major', labelsize=20)
# Remove any leftover axes so empty panels do not appear
for ax in axes_list:
    ax.remove()
plt.tight_layout()
plt.subplots_adjust(hspace=1)
# plt.savefig('SalePrice_median.png')
# Show the sub dataset for Neighborhood == 'Blmngtn'
df_Blmngtn = df1.loc[df1['Neighborhood'] == 'Blmngtn', ['Date', 'SalePrice_median']]
df_Blmngtn.reset_index(drop=True, inplace=True)
print(df_Blmngtn)
plt.figure()
df_Blmngtn.plot(x='Date', y='SalePrice_median', style='o-', title='Bloomington Height')
plt.xlabel('Date Sold')
# plt.savefig('Blmngtn_SalePrice_median.png')
# Show the sub dataset for Neighborhood == 'SawyerW'
df_SawyerW = df1.loc[df1['Neighborhood'] == 'SawyerW', ['Date', 'SalePrice_median']]
df_SawyerW.reset_index(drop=True, inplace=True)
print(df_SawyerW)
plt.figure()
df_SawyerW.plot(x='Date', y='SalePrice_median', style='o-', title='Sawyer West')
plt.xlabel('Date Sold')
# plt.savefig('SawyerW_SalePrice_median.png')
3) A client wants to remodel their home and then sell it on the market; currently the house is a 3 bedroom / 2 bathroom 1500 sq.ft. home. There are three options for the remodel: (1) adding a new bedroom measuring 130 sq.ft., (2) adding a new half bathroom measuring 80 sq.ft., or (3) expanding the living room by 400 sq.ft. (the cost of all three remodel options is equal)
# Copy dataset for question 3 and rebuild the porch / indoor square-feet columns
q3_df = dataset.copy()
q3_df['PorchSF'] = (q3_df['OpenPorchSF'] + q3_df['EnclosedPorch']
                    + q3_df['3SsnPorch'] + q3_df['ScreenPorch'])
q3_df['IndoorSF'] = (q3_df['GrLivArea'] + q3_df['TotalBsmtSF']
                     - q3_df['PorchSF']
                     - q3_df['GarageArea']
                     - q3_df['WoodDeckSF']
                     - q3_df['MasVnrArea'])
print(q3_df.info())
# For simplification here, only 7 columns are selected for modeling purpose
model_cols = ['SalePrice', 'Neighborhood', 'LotFrontage', 'IndoorSF',
              'FullBath', 'HalfBath', 'BedroomAbvGr']
q3_df_model = q3_df[model_cols].copy()
# Drop the observations containing NAs
# (alternatively we can use imputation methods such as median to deal with missing values)
q3_df_model = q3_df_model.dropna()
q3_df_model['log_SalePrice'] = np.log(q3_df_model['SalePrice'])
print('1) The shape of the data for modeling: ', q3_df_model.shape)
print('2) A subset of the data for modeling: ', q3_df_model.head(), sep='\n')
3.2) Statistical Modeling Analysis
# OLS on log(SalePrice) with one-hot encoded neighborhoods (first level dropped
# to avoid the dummy-variable trap)
data_OLS = pd.concat(
    (q3_df_model, pd.get_dummies(q3_df_model['Neighborhood'], drop_first=True)),
    axis=1)
# OLS regression
import statsmodels.api as sm
y = data_OLS['log_SalePrice']
predictors = data_OLS.drop(['SalePrice', 'Neighborhood', 'log_SalePrice'], axis=1)
X = sm.add_constant(predictors)
model = sm.OLS(y, X).fit()
print(model.summary())
# Save model summary to a csv file
with open('model_summary.csv', 'w') as MR:
    MR.write(model.summary().as_csv())
# Alternatively, we can use sklearn to do
# Linear Regression, Decision Tree Regression, and Random Forest Regression below
from sklearn.model_selection import train_test_split
import seaborn as sns
# sns.set(style="white")
# sns.set(style="whitegrid", color_codes=True)
# Correlation heatmap of the numeric modeling columns.
# numeric_only=True: pandas 2.x raises on the object 'Neighborhood' column,
# whereas older pandas silently dropped it.
corr = q3_df_model.corr(numeric_only=True)
# np.bool was removed in NumPy 1.24 -- use the builtin bool dtype instead
mask = np.zeros_like(corr, dtype=bool)
# Mask the upper triangle so each correlation is shown only once
mask[np.triu_indices_from(mask)] = True
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
f, ax = plt.subplots(figsize=(11, 9))
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
# Build feature matrix X (columns 1..6: Neighborhood .. BedroomAbvGr)
# and target y (log_SalePrice) from the model data
X = q3_df_model.iloc[:, 1:7].values
y = q3_df_model.iloc[:, -1].values
q3_model_1 = q3_df_model.iloc[:, 1:7].copy()
# Three hypothetical remodeled homes in CollgCr (columns match q3_model_1):
# option1 +bedroom (4 bd), option2 +half bath, option3 +400 sq.ft. living room
new = [('CollgCr', 80, 1630, 2, 0, 4),
       ('CollgCr', 80, 1580, 2, 1, 3),
       ('CollgCr', 80, 1900, 2, 0, 3)]
new_df = pd.DataFrame(new, columns=q3_model_1.columns)
# DataFrame.append was removed in pandas 2.0 -- use pd.concat instead
q3_model_2 = pd.concat([q3_model_1, new_df], ignore_index=True)
# df32.tail()
x_new = q3_model_2.values
x_new[0]
# One-hot encode the 'Neighborhood' column (position 0) of x_new.
# OneHotEncoder's categorical_features argument was removed in sklearn 0.22:
# encode column 0 on its own and stack it in front of the remaining features,
# which reproduces the old layout (dummy columns first, then the rest).
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
warnings.filterwarnings("ignore")
labelencoder_X = LabelEncoder()
x_new[:, 0] = labelencoder_X.fit_transform(x_new[:, 0])
onehotencoder = OneHotEncoder()
neigh_dummies = onehotencoder.fit_transform(x_new[:, [0]]).toarray()
x_new = np.hstack([neigh_dummies, x_new[:, 1:].astype(float)])
x_new[-1]
# The last three rows are the three remodel options appended earlier
x_option1 = x_new[-3]
x_option2 = x_new[-2]
x_option3 = x_new[-1]
x_new.shape
# One coefficient name per feature column
# (was hard-coded to 31, which breaks if the dummy count changes)
col = ['coef_x_%d' % i for i in range(1, x_new.shape[1] + 1)]
# Split data into training and testing sets.
# Only the original observations take part in training: the last three rows of
# x_new are the hypothetical remodel options, so slice to len(y) instead of the
# previously hard-coded 1195 (which silently breaks if the dataset changes).
X_train, X_test, y_train, y_test = train_test_split(
    x_new[:len(y), :], y, test_size=0.25, random_state=42)
# Linear regression in sklearn
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
regressor_LR = LinearRegression()
regressor_LR.fit(X_train, y_train)
# Predicting the testing set results with Linear Regression
y_predict_LR = regressor_LR.predict(X_test)
# LASSO with the penalty strength chosen by 10-fold cross-validation.
# The normalize=True argument was removed from Lasso/LassoCV in sklearn 1.2;
# standardize the features explicitly with StandardScaler instead.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)
regressor_LASSO = Lasso(max_iter=10000)
regressor_LASSO_cv = LassoCV(alphas=None, cv=10, max_iter=100000)
regressor_LASSO_cv.fit(X_train_std, y_train)
regressor_LASSO.set_params(alpha=regressor_LASSO_cv.alpha_)
regressor_LASSO.fit(X_train_std, y_train)
y_fitted_LASSO = regressor_LASSO.predict(X_train_std)
y_predict_LASSO = regressor_LASSO.predict(X_test_std)
print('Intercept: \n', regressor_LASSO.intercept_)
coef_df = pd.DataFrame(regressor_LASSO.coef_, index=col, columns=['Coef'])
fitted_df = pd.DataFrame(y_fitted_LASSO, index=range(len(X_train)), columns=['FittedValue'])
# Features whose coefficient was shrunk to exactly zero by the L1 penalty
coef_df[coef_df['Coef'] == 0]
# Fit a Decision Tree regressor and predict on the held-out test set
from sklearn.tree import DecisionTreeRegressor
regressor_DTR = DecisionTreeRegressor(random_state=0)
regressor_DTR.fit(X_train, y_train)
y_predict_DTR = regressor_DTR.predict(X_test)
# Fit a Random Forest regressor (300 trees) and predict on the test set
from sklearn.ensemble import RandomForestRegressor
regressor_RFR = RandomForestRegressor(n_estimators=300, random_state=0)
regressor_RFR.fit(X_train, y_train)
y_predict_RFR = regressor_RFR.predict(X_test)
# Score each remodel option with the random forest
# (predict expects 2-D input, hence the reshape)
y_pred_option1, y_pred_option2, y_pred_option3 = (
    regressor_RFR.predict(opt_row.reshape(1, -1))
    for opt_row in (x_option1, x_option2, x_option3))
# Back-transform the log-scale predictions to dollar amounts
print('Predicted Sale Price for Option 1: ', math.exp(y_pred_option1[0]))
print('Predicted Sale Price for Option 2: ', math.exp(y_pred_option2[0]))
print('Predicted Sale Price for Option 3: ', math.exp(y_pred_option3[0]))
# Compare the three models on the test set via root mean squared error
# (computed on the log-price scale)
from sklearn.metrics import mean_squared_error
rmse_LR = np.sqrt(mean_squared_error(y_test, y_predict_LR))
print('Root Mean Squared Error for Linear Regression', rmse_LR)
rmse_DTR = np.sqrt(mean_squared_error(y_test, y_predict_DTR))
print('Root Mean Squared Error for Decision Tree Regression', rmse_DTR)
rmse_RFR = np.sqrt(mean_squared_error(y_test, y_predict_RFR))
print('Root Mean Squared Error for Random Forest Regression', rmse_RFR)
3.1) Analysis from the Dataset Itself
# Sort sales within each neighborhood chronologically, then build a
# zero-padded 'mm-YYYY' Date column from MoSold/YrSold
q3_df = (q3_df.groupby(['Neighborhood'])
              .apply(lambda g: g.sort_values(['YrSold', 'MoSold']))
              .reset_index(drop=True))
q3_df['Date'] = q3_df['MoSold'].astype(str) + '-' + q3_df['YrSold'].astype(str)
q3_df['Date'] = pd.to_datetime(q3_df['Date'], format='%m-%Y').dt.strftime('%m-%Y')
# Option1: Adding a new bedroom measuring 130 sq.ft.
# Comparable sales: >= 1630 sq.ft. indoor, 4 bedrooms, 2 full baths
opt1_cols = ['YrSold', 'MoSold', 'Date', 'SalePrice', 'IndoorSF', 'Neighborhood',
             'BsmtHalfBath', 'HalfBath', 'FullBath', 'BedroomAbvGr']
opt1_filter = ((q3_df['IndoorSF'] >= 1630)
               & (q3_df['BedroomAbvGr'] == 4)
               & (q3_df['FullBath'] == 2))
df_opt1 = q3_df.loc[opt1_filter, opt1_cols].copy()
print(df_opt1.head())
# Overall sale-price statistics for option1
agg_opt1 = df_opt1.agg({"SalePrice": ['min', 'max', 'mean', 'median']})
agg_opt1 = agg_opt1.rename(columns={'SalePrice': 'SalePrice_opt1'})
# Shared mapping from flattened agg names to option1 column names
opt1_stat_names = {'SalePrice_min': 'SalePriceMin_opt1',
                   'SalePrice_max': 'SalePriceMax_opt1',
                   'SalePrice_mean': 'SalePriceAvg_opt1',
                   'SalePrice_median': 'SalePriceMedian_opt1'}
# Sale-price statistics grouped by month sold
agg_mo_opt1 = df_opt1.groupby(['MoSold']).agg({"SalePrice": ['min', 'max', 'mean', 'median']})
agg_mo_opt1.columns = agg_mo_opt1.columns.map('_'.join)
agg_mo_opt1 = agg_mo_opt1.rename(columns=opt1_stat_names).reset_index()
print(agg_mo_opt1.head())
# Sale-price statistics grouped by date sold
grouped_NoRegion_opt1 = df_opt1.groupby(['Date', 'YrSold', 'MoSold']).agg({"SalePrice": ['min', 'max', 'mean', 'median']})
grouped_NoRegion_opt1.columns = grouped_NoRegion_opt1.columns.map('_'.join)
agg_NoRegion_opt1 = (grouped_NoRegion_opt1.reset_index()
                     .sort_values(['YrSold', 'MoSold'])
                     .rename(columns=opt1_stat_names))
print(agg_NoRegion_opt1.head())
# Sale-price statistics grouped by neighborhood
grouped_NoDate_opt1 = df_opt1.groupby(['Neighborhood']).agg({"SalePrice": ['min', 'max', 'mean', 'median']})
grouped_NoDate_opt1.columns = grouped_NoDate_opt1.columns.map('_'.join)
agg_NoDate_opt1 = (grouped_NoDate_opt1.reset_index()
                   .sort_values(['Neighborhood'])
                   .rename(columns=opt1_stat_names))
print(agg_NoDate_opt1.head())
# Option2: Adding a new half bathroom measuring 80 sq.ft.
# Comparable sales: >= 1580 sq.ft. indoor, 3 bedrooms, 2 full baths, 1 half bath
opt2_cols = ['YrSold', 'MoSold', 'Date', 'SalePrice', 'IndoorSF', 'Neighborhood',
             'BsmtHalfBath', 'HalfBath', 'FullBath', 'BedroomAbvGr']
opt2_filter = ((q3_df['IndoorSF'] >= 1580)
               & (q3_df['BedroomAbvGr'] == 3)
               & (q3_df['FullBath'] == 2)
               & (q3_df['HalfBath'] == 1))
df_opt2 = q3_df.loc[opt2_filter, opt2_cols].copy()
print(df_opt2.head())
# Overall sale-price statistics for option2
agg_opt2 = df_opt2.agg({"SalePrice": ['min', 'max', 'mean', 'median']})
agg_opt2 = agg_opt2.rename(columns={'SalePrice': 'SalePrice_opt2'})
# Shared mapping from flattened agg names to option2 column names
opt2_stat_names = {'SalePrice_min': 'SalePriceMin_opt2',
                   'SalePrice_max': 'SalePriceMax_opt2',
                   'SalePrice_mean': 'SalePriceAvg_opt2',
                   'SalePrice_median': 'SalePriceMedian_opt2'}
# Sale-price statistics grouped by month sold
agg_mo_opt2 = df_opt2.groupby(['MoSold']).agg({"SalePrice": ['min', 'max', 'mean', 'median']})
agg_mo_opt2.columns = agg_mo_opt2.columns.map('_'.join)
agg_mo_opt2 = agg_mo_opt2.rename(columns=opt2_stat_names).reset_index()
print(agg_mo_opt2)
# Sale-price statistics grouped by date sold
grouped_NoRegion_opt2 = df_opt2.groupby(['Date', 'YrSold', 'MoSold']).agg({"SalePrice": ['min', 'max', 'mean', 'median']})
grouped_NoRegion_opt2.columns = grouped_NoRegion_opt2.columns.map('_'.join)
agg_NoRegion_opt2 = (grouped_NoRegion_opt2.reset_index()
                     .sort_values(['YrSold', 'MoSold'])
                     .rename(columns=opt2_stat_names))
print(agg_NoRegion_opt2.head())
# Sale-price statistics grouped by neighborhood
grouped_NoDate_opt2 = df_opt2.groupby(['Neighborhood']).agg({"SalePrice": ['min', 'max', 'mean', 'median']})
grouped_NoDate_opt2.columns = grouped_NoDate_opt2.columns.map('_'.join)
agg_NoDate_opt2 = (grouped_NoDate_opt2.reset_index()
                   .sort_values(['Neighborhood'])
                   .rename(columns=opt2_stat_names))
print(agg_NoDate_opt2.head())
# Option3: Expanding the living room by 400 sq.ft.
# Comparable sales: >= 1900 sq.ft. indoor, 3 bedrooms, 2 full baths
opt3_cols = ['YrSold', 'MoSold', 'Date', 'SalePrice', 'IndoorSF', 'Neighborhood',
             'BsmtHalfBath', 'HalfBath', 'FullBath', 'BedroomAbvGr']
opt3_filter = ((q3_df['IndoorSF'] >= 1900)
               & (q3_df['BedroomAbvGr'] == 3)
               & (q3_df['FullBath'] == 2))
df_opt3 = q3_df.loc[opt3_filter, opt3_cols].copy()
print(df_opt3.head())
# Overall sale-price statistics for option3
agg_opt3 = df_opt3.agg({"SalePrice": ['min', 'max', 'mean', 'median']})
agg_opt3 = agg_opt3.rename(columns={'SalePrice': 'SalePrice_opt3'})
# Shared mapping from flattened agg names to option3 column names
opt3_stat_names = {'SalePrice_min': 'SalePriceMin_opt3',
                   'SalePrice_max': 'SalePriceMax_opt3',
                   'SalePrice_mean': 'SalePriceAvg_opt3',
                   'SalePrice_median': 'SalePriceMedian_opt3'}
# Sale-price statistics grouped by month sold
agg_mo_opt3 = df_opt3.groupby(['MoSold']).agg({"SalePrice": ['min', 'max', 'mean', 'median']})
agg_mo_opt3.columns = agg_mo_opt3.columns.map('_'.join)
agg_mo_opt3 = agg_mo_opt3.rename(columns=opt3_stat_names).reset_index()
print(agg_mo_opt3.head())
# Sale-price statistics grouped by date sold
grouped_NoRegion_opt3 = df_opt3.groupby(['Date', 'YrSold', 'MoSold']).agg({"SalePrice": ['min', 'max', 'mean', 'median']})
grouped_NoRegion_opt3.columns = grouped_NoRegion_opt3.columns.map('_'.join)
agg_NoRegion_opt3 = (grouped_NoRegion_opt3.reset_index()
                     .sort_values(['YrSold', 'MoSold'])
                     .rename(columns=opt3_stat_names))
print(agg_NoRegion_opt3.head())
# Sale-price statistics grouped by neighborhood
grouped_NoDate_opt3 = df_opt3.groupby(['Neighborhood']).agg({"SalePrice": ['min', 'max', 'mean', 'median']})
grouped_NoDate_opt3.columns = grouped_NoDate_opt3.columns.map('_'.join)
agg_NoDate_opt3 = (grouped_NoDate_opt3.reset_index()
                   .sort_values(['Neighborhood'])
                   .rename(columns=opt3_stat_names))
print(agg_NoDate_opt3.head())
# Merge the overall statistics of the three options (joined on the
# statistic-name index: min/max/mean/median)
merge_result = reduce(
    lambda left, right: pd.merge(left, right,
                                 left_index=True, right_index=True,
                                 how='inner'),
    [agg_opt1, agg_opt2, agg_opt3])
# merge_result.to_excel('agg_option.xlsx')
# Merge the per-month statistics of the three options
merge_mo_result = reduce(
    lambda left, right: pd.merge(left, right, on=['MoSold'], how='inner'),
    [agg_mo_opt1, agg_mo_opt2, agg_mo_opt3])
# merge_mo_result.to_excel('agg_mo_option.xlsx')
# Merge the per-neighborhood statistics of the three options
merge_re_result = reduce(
    lambda left, right: pd.merge(left, right, on=['Neighborhood'], how='inner'),
    [agg_NoDate_opt1, agg_NoDate_opt2, agg_NoDate_opt3])
# merge_re_result.to_excel('agg_re_option.xlsx')
# Plot the median sale price by neighborhood, one panel per remodel option
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(12, 10),
                         gridspec_kw=dict(hspace=1.0))
region_panels = [
    (agg_NoDate_opt1, 'SalePriceMedian_opt1', 'Sale Price Median (Option1)'),
    (agg_NoDate_opt2, 'SalePriceMedian_opt2', 'Sale Price Median (Option2)'),
    (agg_NoDate_opt3, 'SalePriceMedian_opt3', 'Sale Price Median (Option3)'),
]
for axis, (frame, col_name, y_label) in zip(axes, region_panels):
    positions = np.arange(len(frame['Neighborhood']))
    axis.plot(positions, frame[col_name], '--')
    # One tick per neighborhood, labels rotated for readability
    axis.xaxis.set_ticks(positions)
    axis.xaxis.set_ticklabels(frame['Neighborhood'], rotation=90)
    axis.set_xlabel("Neighborhood")
    axis.set_ylabel(y_label)
fig.savefig('SalePrice_median_by_region.png')
# Plot the median sale price by date sold, one panel per remodel option
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(12, 10),
                         gridspec_kw=dict(hspace=1.0))
date_panels = [
    (agg_NoRegion_opt1, 'SalePriceMedian_opt1', 'Sale Price Median (Option1)'),
    (agg_NoRegion_opt2, 'SalePriceMedian_opt2', 'Sale Price Median (Option2)'),
    (agg_NoRegion_opt3, 'SalePriceMedian_opt3', 'Sale Price Median (Option3)'),
]
for axis, (frame, col_name, y_label) in zip(axes, date_panels):
    positions = np.arange(len(frame['Date']))
    axis.plot(positions, frame[col_name], '--')
    # One tick per sale date, labels rotated for readability
    axis.xaxis.set_ticks(positions)
    axis.xaxis.set_ticklabels(frame['Date'], rotation=90)
    axis.set_xlabel("Date Sold")
    axis.set_ylabel(y_label)
fig.savefig('SalePrice_median_by_Date.png')
4) A client owns a single-family home (BldgType = '1Fam') with 4 bedrooms (assume all bedrooms are above grade) and must decide whether to sell or rent. The options are: (a) convert the home into a duplex and rent both units; (b) rent the home as is; (c) sell the home for market value.
# Question 4: copy the raw data, order sales chronologically within each
# neighborhood, and build an 'mm-YYYY' Date column
q4_df = dataset.copy()
q4_df = (q4_df.groupby(['Neighborhood'])
              .apply(lambda g: g.sort_values(['YrSold', 'MoSold']))
              .reset_index(drop=True))
q4_df['Date'] = q4_df['MoSold'].astype(str) + '-' + q4_df['YrSold'].astype(str)
q4_df['Date'] = pd.to_datetime(q4_df['Date'], format='%m-%Y').dt.strftime('%m-%Y')
# Option (a): convert the home into a duplex and rent both units.
# Each duplex unit is proxied by 2-bedroom single-family sales.
df_opt1 = q4_df[(q4_df['BedroomAbvGr'] == 2) & (q4_df['BldgType'] == '1Fam')] \
    [['YrSold', 'MoSold', 'Date', 'SalePrice', 'Neighborhood', 'BedroomAbvGr', 'BldgType']].copy()
print(df_opt1.head())
# Sale-price statistics per neighborhood
df_opt1_NoDate = df_opt1.groupby(['Neighborhood']).agg({"SalePrice": ['min', 'max', 'mean', 'median']})
df_opt1_NoDate.columns = ["_".join(x) for x in df_opt1_NoDate.columns.ravel()]
df_opt1_NoDate = df_opt1_NoDate.reset_index().sort_values(['Neighborhood'])
df_opt1_NoDate.rename(columns={'SalePrice_min': 'SalePriceMin_opt1',
                               'SalePrice_max': 'SalePriceMax_opt1',
                               'SalePrice_mean': 'SalePriceAvg_opt1',
                               'SalePrice_median': 'SalePriceMedian_opt1'}, inplace=True)
# .copy() the two-column selection: the original assigned a new column to a
# slice view, which triggers SettingWithCopy and may silently fail
df_opt1_final = df_opt1_NoDate[['Neighborhood', 'SalePriceMedian_opt1']].copy()
# Assume yearly rent = 10% of the median sale price, doubled for the two units
df_opt1_final['YearlyRent_opt1'] = df_opt1_final['SalePriceMedian_opt1'] * 0.1 * 2
print(df_opt1_final.head())
df_opt1_f = df_opt1_final.copy()
df_opt1_f = df_opt1_f.set_index('Neighborhood')
# Assume discount rate is 5%
discountRate = 0.05
# Net present value of renting for the next 5, 10, and 15 years.
# np.npv was removed in NumPy 1.20; compute the discount factors directly,
# keeping np.npv's convention that the first cashflow occurs at t=0.
for n_years in (5, 10, 15):
    annuity_factor = sum(1.0 / (1.0 + discountRate) ** t for t in range(n_years))
    df_opt1_f['npv_%d_opt1' % n_years] = df_opt1_f['YearlyRent_opt1'] * annuity_factor
# df_opt1_f.to_excel('q4_opt1.xlsx')
# Option (b): rent the home as is.
# Comparable sales: 4-bedroom single-family homes ('1Fam').
df_opt2 = q4_df[(q4_df['BedroomAbvGr'] == 4) & (q4_df['BldgType'] == '1Fam')] \
    [['YrSold', 'MoSold', 'Date', 'SalePrice', 'Neighborhood', 'BedroomAbvGr', 'BldgType']].copy()
print(df_opt2.head())
# Sale-price statistics per neighborhood
df_opt2_NoDate = df_opt2.groupby(['Neighborhood']).agg({"SalePrice": ['min', 'max', 'mean', 'median']})
df_opt2_NoDate.columns = ["_".join(x) for x in df_opt2_NoDate.columns.ravel()]
df_opt2_NoDate = df_opt2_NoDate.reset_index().sort_values(['Neighborhood'])
df_opt2_NoDate.rename(columns={'SalePrice_min': 'SalePriceMin_opt2',
                               'SalePrice_max': 'SalePriceMax_opt2',
                               'SalePrice_mean': 'SalePriceAvg_opt2',
                               'SalePrice_median': 'SalePriceMedian_opt2'}, inplace=True)
# .copy() the two-column selection: the original assigned a new column to a
# slice view, which triggers SettingWithCopy and may silently fail
df_opt2_final = df_opt2_NoDate[['Neighborhood', 'SalePriceMedian_opt2']].copy()
# Assume yearly rent = 10% of the median sale price (single unit)
df_opt2_final['YearlyRent_opt2'] = df_opt2_final['SalePriceMedian_opt2'] * 0.1
print(df_opt2_final.head())
df_opt2_f = df_opt2_final.copy()
df_opt2_f = df_opt2_f.set_index('Neighborhood')
# Assume discount rate is 5%
discountRate = 0.05
# Net present value of renting for the next 5, 10, and 15 years.
# np.npv was removed in NumPy 1.20; compute the discount factors directly,
# keeping np.npv's convention that the first cashflow occurs at t=0.
for n_years in (5, 10, 15):
    annuity_factor = sum(1.0 / (1.0 + discountRate) ** t for t in range(n_years))
    df_opt2_f['npv_%d_opt2' % n_years] = df_opt2_f['YearlyRent_opt2'] * annuity_factor
# df_opt2_f.to_excel('q4_opt2.xlsx')
# Option (c): sell at market value -- reuse the 4-bedroom per-neighborhood
# median sale price as the one-off revenue
df_opt3_f = df_opt2_NoDate[['Neighborhood', 'SalePriceMedian_opt2']].copy()
df_opt3_f = df_opt3_f.rename(columns={'SalePriceMedian_opt2': 'Revenue_opt3'})
print(df_opt3_f.head())
# Combine the per-neighborhood results of all three options into one table
merged_result = reduce(
    lambda left, right: pd.merge(left, right, on=['Neighborhood'], how='inner'),
    [df_opt1_f, df_opt2_f, df_opt3_f])
print(merged_result.head())
# merged_result.to_excel('q4_option_all.xlsx')